# Two-sided permutation-test helper: builds the null distribution of the
# difference in group means under random relabelling of `treatment`.
#
# Args:
#   treatment: group labels (two groups expected; with more levels the extra
#              NA differences are dropped via complete.cases).
#   outcome:   numeric vector, same length as `treatment`.
#   n:         number of permutations to draw.
#
# Returns: numeric vector of length `n` with the permuted mean differences.
# Note: the p-value computation is intentionally left commented out; callers
# consume the raw null distribution (e.g. for the max-T correction below).
permutation.test <- function(treatment, outcome, n) {
  # Observed difference in means (kept for the commented p-value formula).
  observed <- diff(tapply(outcome, treatment, mean))
  observed <- observed[complete.cases(observed)]
  # Preallocate instead of growing the vector inside the loop.
  distribution <- numeric(n)
  for (i in seq_len(n)) {
    # Shuffling the labels destroys any treatment/outcome association,
    # so each difference in means is a draw from H0.
    shuffled <- treatment[sample(length(treatment), replace = FALSE)]
    dist <- diff(by(outcome, shuffled, mean))
    distribution[i] <- dist[complete.cases(dist)]
  }
  # p-value, if needed:
  # sum(abs(distribution) >= abs(observed)) / n
  distribution
}
# Load the credit_score table and prepare column names/types.
# dbGetQuery sends the query, fetches the result AND clears the result set
# (the original dbSendQuery handle was never cleared with dbClearResult).
credit_score <- as.data.frame(dbGetQuery(con, "SELECT * FROM credit_score"))
# Normalise all column names in one vectorised call (no per-column loop).
colnames(credit_score) <- tolower(colnames(credit_score))
# Drop bookkeeping columns that carry no signal.
credit_score <- subset(credit_score, select = -c(index, id))
# Categorical features become factors ...
categories <- c("sex", "education", "marriage", "default")
credit_score[categories] <- lapply(credit_score[categories], as.factor)
# ... and every remaining column becomes numeric.
numerical <- setdiff(names(credit_score), categories)
credit_score[numerical] <- lapply(credit_score[numerical],
                                  function(x) as.numeric(unlist(x)))
# Education as an ordered factor, levels 6 .. 0.
# NOTE(review): the 6..0 ordering is taken from the original code — confirm
# against the dataset codebook.
credit_score$education <- factor(credit_score$education,
                                 levels = c(6, 5, 4, 3, 2, 1, 0),
                                 ordered = TRUE)
credit_score <- na.omit(credit_score)

First, I have to admit that this is a synthetic dataset, which means that there are no missing values, outliers, errors or other data-quality problems, so those examinations will be skipped.

# Interactive version of p1 — presumably the limit_bal histogram built in an
# earlier chunk not shown here; verify against the full document.
ggplotly(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Note: the distribution of limit_bal is probably not normal. We will use this fact later.

# NOTE(review): this log-transforms `age` IN PLACE, so every later use of
# `age` (t-test, Wilcoxon, permutation test below) operates on log(age).
# Re-running this line would double-log the column; a separate `age_log`
# column would be safer.
credit_score$age <- log(credit_score$age)
p2 <- ggplot(credit_score, aes(x = age)) + geom_histogram()
ggplotly(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Credit limit split by each categorical feature.
p3 <- ggplot(credit_score, aes(x=sex, y=limit_bal)) + geom_boxplot()
ggplotly(p3)
p4 <- ggplot(credit_score, aes(x=education, y=limit_bal, fill=education)) + geom_boxplot()
ggplotly(p4)
p5 <- ggplot(credit_score, aes(x=marriage, y=limit_bal, fill=marriage)) + geom_boxplot()
ggplotly(p5)
# Counts of clients by default status and sex, as a dodged bar chart.
t <- (table(credit_score$default,credit_score$sex))
t <- as.data.frame(t)
colnames(t) <- c('default', 'sex', 'cnt')
p6 <-
  ggplot(t, aes(x = default, y = cnt, fill = sex)) + geom_bar(stat = 'identity',  position = position_dodge())
ggplotly(p6)
# Same breakdown by education level.
t <- (table(credit_score$default, credit_score$education))
t <- as.data.frame(t)
colnames(t) <- c('default', 'education', 'cnt')

# NOTE(review): `p6` is reused here, so the sex plot above is no longer
# reachable by name after this point.
p6 <-
  ggplot(t, aes(x = default, y = cnt, fill=education)) + geom_bar(stat = 'identity',  position = position_dodge())
ggplotly(p6)
# Same breakdown by marriage status.
t <- (table(credit_score$default, credit_score$marriage))
t <- as.data.frame(t)
colnames(t) <- c('default', 'marriage', 'cnt')

p7 <-
  ggplot(t, aes(x = default, y = cnt, fill=marriage)) + geom_bar(stat = 'identity',  position = position_dodge())
ggplotly(p7)

Let’s test two hypotheses: - Are the mean credit limits (limit_bal) for the two groups default = 0 (repaid the credit) and default = 1 (defaulted) equal to each other? - Are the distributions of limit_bal for these two groups also equal to each other?

In order to answer these and the following questions I will calculate confidence intervals. Here and below I will use a non-parametric permutation test in order to compare distributions.

# Welch two-sample t-test: H0 = equal mean limit_bal for default = 0 vs 1
# (does not assume equal variances).
t.test(limit_bal ~ default, data = credit_score)
## 
##  Welch Two Sample t-test
## 
## data:  limit_bal by default
## t = 28.952, df = 11982, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  44740.91 51239.23
## sample estimates:
## mean in group 0 mean in group 1 
##        178099.7        130109.7
# Non-parametric counterpart: Wilcoxon rank-sum test on the location shift.
wilcox.test(limit_bal ~ default, data = credit_score)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  limit_bal by default
## W = 95786286, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Permutation null distribution of the difference in mean limit_bal
# (1000 random relabellings), visualised as a histogram.
hist1 = permutation.test(credit_score$default, credit_score$limit_bal, 1000)
p8 <- ggplot(as.data.frame(hist1), aes(x=hist1)) + geom_histogram()
ggplotly(p8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

These results are obviously practically significant.

Now, let’s test another pair of hypotheses: are the mean ages and their distributions for these two groups equal to each other?

# Same battery of tests for age (note: `age` was log-transformed earlier).
t.test(age ~ default, credit_score)
## 
##  Welch Two Sample t-test
## 
## data:  age by default
## t = -1.2343, df = 10171, p-value = 0.2171
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.011595106  0.002634737
## sample estimates:
## mean in group 0 mean in group 1 
##         3.53598         3.54046
wilcox.test(age ~ default, credit_score)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by default
## W = 76966880, p-value = 0.3725
## alternative hypothesis: true location shift is not equal to 0
# Permutation null distribution for the age difference in means.
hist2 <- permutation.test(credit_score$default, credit_score$age, 1000)
p9 <- ggplot(as.data.frame(hist2), aes(x=hist2)) + geom_histogram()
ggplotly(p9)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#aovp(limit_bal~education, data=credit_score)
# NOTE(review): this shadows base::subset with a data frame; any later code
# referring to `subset$...` silently picks up this education 1-vs-2 slice.
subset <- credit_score %>% filter(education==2|education==1)

The result we received tells us that, statistically, the mean ages are not significantly different, and from the confidence interval we can see that any difference would in any case hardly be practically significant.

Now let’s see if the gender composition for the two groups differ.

# Split clients into repaid (default = 0) and defaulted (default = 1) groups.
good <- filter(credit_score, default == 0)
bad <- filter(credit_score, default == 1)
# Multiple assignment via zeallot's %<-%: count of the first sex level in
# each group plus the group totals.
# NOTE(review): assumes the first level of `sex` codes men — confirm against
# the dataset codebook.
c(ngoodmen, total_good, nbadmen, total_bad) %<-% c(table(good$sex)[1], sum(table(good$sex)), table(bad$sex)[1], sum(table(bad$sex)))
# Score confidence interval for the difference of two proportions
# (PropCIs::diffscoreci): proportion of men among repaid vs defaulted.
diffscoreci(ngoodmen, total_good, nbadmen, total_bad, conf.level = 0.95)
## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  -0.06057240 -0.03366348
library("ggpubr")
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
# Static boxplot of limit_bal by education level (ggpubr wrapper).
ggboxplot(credit_score, x = "education", y = "limit_bal", 
          color = "education", 
          order = c(6, 5, 4, 3, 2, 1),
          ylab = "Limit Balance", xlab = "Education Level")

# Interactive version of the same comparison.
p8 <- ggplot(credit_score, aes(x=education, y=limit_bal)) + geom_boxplot()
ggplotly(p8)

Let’s try and test ANOVA model for comparing means of limit_bal dependent on education. ANOVA assumes homoscedasticity as well as normality of distribution. Let’s build a model and view a QQ-plot, it will help us understand if the target is normally distributed.

# One-way ANOVA of limit_bal on education level.
# NOTE(review): `sample(5000)` is a random permutation of 1:5000, i.e. this
# always uses the FIRST 5000 rows (shuffled), not a random sample of 5000
# rows from the whole dataset; `sample(nrow(credit_score), 5000)` was
# probably intended. Left unchanged because the output below was produced
# by this exact call.
res.aov = aov(limit_bal ~ education, data=credit_score[sample(5000), ])
summary(res.aov)
##               Df   Sum Sq   Mean Sq F value Pr(>F)    
## education      6 5.72e+12 9.534e+11    60.1 <2e-16 ***
## Residuals   4993 7.92e+13 1.586e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey's HSD: all pairwise education-level mean comparisons with
# family-wise 95% confidence.
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = limit_bal ~ education, data = credit_score[sample(5000), ])
## 
## $education
##           diff         lwr       upr     p adj
## 5-6 -63333.333 -214993.324  88326.66 0.8819088
## 4-6 -31000.000 -193637.221 131637.22 0.9977941
## 3-6 -78442.822 -210421.811  53536.17 0.5800761
## 2-6 -60390.384 -191963.329  71182.56 0.8261794
## 1-6   3480.945 -128142.158 135104.05 1.0000000
## 0-6  85000.000 -309024.214 479024.21 0.9956158
## 4-5  32333.333  -89938.860 154605.53 0.9868928
## 3-5 -15109.489  -92038.527  61819.55 0.9973918
## 2-5   2942.950  -73287.383  79173.28 0.9999998
## 1-5  66814.278   -9502.592 143131.15 0.1315499
## 0-5 148333.333 -230816.644 527483.31 0.9110497
## 3-4 -47442.822 -144232.232  49346.59 0.7767505
## 2-4 -29390.384 -125625.391  66844.62 0.9725368
## 1-4  34480.945  -61822.625 130784.51 0.9406114
## 0-4 116000.000 -267672.801 499672.80 0.9738916
## 2-3  18052.439    2927.489  33177.39 0.0079171
## 1-3  81923.767   66368.536  97479.00 0.0000000
## 0-3 163442.822 -208272.668 535158.31 0.8536353
## 1-2  63871.328   52254.449  75488.21 0.0000000
## 0-2 145390.384 -226181.133 516961.90 0.9109901
## 0-1  81519.055 -290070.225 453108.34 0.9951924
# Diagnostic plots: residuals vs fitted (homoscedasticity check) ...
plot(res.aov, 1)

# ... and the normal Q-Q plot of residuals (normality check).
plot(res.aov, 2)
## Warning: not plotting observations with leverage one:
##   4188

As we can see, distribution is far from normal. Let’s validate this hypothesis using Shapiro-Wilk test.

# Shapiro-Wilk test on the ANOVA residuals: H0 = residuals are normal.
aov_residuals <- res.aov[['residuals']]
shapiro.test(x=aov_residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  aov_residuals
## W = 0.92408, p-value < 2.2e-16

As expected, H_0 has to be rejected: the distribution of limit_bal is not normal. That means that we cannot rely on the ANOVA model conclusions and need to use other non-parametric criteria, for example a permutation test. The education feature is a categorical (not binary) variable, which means that we have to incorporate a multiple-testing correction (the max-T method) in order to obtain the H_0 distribution for the differences of means.

But first, let’s try to calculate log and square root of the limit_bal, hoping that it will help to restore normality. Calculations are hidden, distribution didn’t become normal anyway.

# Try variance-stabilising transforms of limit_bal (log and square root).
credit_score$limit_bal_log = log(credit_score$limit_bal)
# NOTE(review): `default` is a factor, so aov(default ~ ...) is not a valid
# linear model — hence the warnings below; the fit is never used afterwards.
aov.log = aov(default~limit_bal_log, credit_score)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
# Q-Q plots to eyeball normality of the transformed variables.
qqPlot(x=credit_score$limit_bal_log)

credit_score$limit_bal_sqrt = sqrt(credit_score$limit_bal)
qqPlot(x=credit_score$limit_bal_sqrt)

# Max-T multiple-testing correction: permutation null distribution of the
# MAXIMUM absolute pairwise difference in mean limit_bal over all distinct
# pairs of education levels 1..6.
# (Fixes the original, which indexed a non-existent Var3 column of the
# expand.grid result (NA), passed the leftover `subset` data frame — always
# the education 1-vs-2 slice — into permutation.test instead of the per-pair
# selection, and included same-level and duplicate pairs in the grid.)
n_perm <- 1000
pairs <- combn(1:6, 2)  # 15 distinct unordered level pairs, one per column
perm_dists <- apply(pairs, MARGIN = 2, function(p) {
  # Rows belonging to the two education levels of this pair.
  selection <- subset(credit_score,
                      education == p[1] | education == p[2],
                      c(education, limit_bal))
  permutation.test(treatment = droplevels(selection$education),
                   outcome = selection$limit_bal,
                   n = n_perm)
})
# perm_dists is n_perm x n_pairs; within each permutation iteration (row),
# take the largest absolute difference across pairs -> max-T distribution.
distrib <- apply(perm_dists, MARGIN = 1, function(x) max(abs(x)))
p8 <- ggplot(as.data.frame(distrib), aes(x = distrib)) + geom_histogram()
ggplotly(p8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

That means that men fail to repay their credits slightly more often (by 3–6 percentage points) than women.

Now, let’s see if the education level impacts the default rate. First, calculate a table which shows the sizes of the default and no-default groups for each education level; secondly, see how these sizes differ from the expected ones; then calculate the value of the statistical criterion.

# Contingency table: education level vs default status.
crosstab <- table(credit_score$education, credit_score$default)
crosstab
##    
##         0     1
##   6    43     8
##   5   262    18
##   4   116     7
##   3  3680  1237
##   2 10700  3330
##   1  8549  2036
##   0    14     0
# Observed minus expected counts under the independence hypothesis.
crosstab - chisq.test(crosstab)$expected
## Warning in chisq.test(crosstab): Chi-squared approximation may be incorrect
##    
##             0         1
##   6    3.2812   -3.2812
##   5   43.9360  -43.9360
##   4   20.2076  -20.2076
##   3 -149.3596  149.3596
##   2 -226.5640  226.5640
##   1  305.4020 -305.4020
##   0    3.0968   -3.0968
# Pearson's chi-squared test of independence. The warning stems from small
# expected counts in the rare education levels (0, 4, 5, 6).
chisq.test(crosstab)
## Warning in chisq.test(crosstab): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  crosstab
## X-squared = 163.22, df = 6, p-value < 2.2e-16
# Effect-size measures (vcd::assocstats): contingency coefficient, Cramer's V.
assocstats(crosstab)
##                     X^2 df P(> X^2)
## Likelihood Ratio 184.71  6        0
## Pearson          163.22  6        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.074 
## Cramer's V        : 0.074

From this we can conclude that education has a statistically and practically significant impact on both default and limit_bal; however, these dependencies are not linear (and, in the case of default, not uniform).

Finally, let’s see if the marriage category impacts the default category.

# Contingency table: marriage status vs default status.
marriage_crosstab <- table(credit_score$marriage, credit_score$default)
marriage_crosstab
##    
##         0     1
##   0    49     5
##   1 10453  3206
##   2 12623  3341
##   3   239    84
# Chi-squared test plus effect-size measures for the association.
assocstats(marriage_crosstab)
##                     X^2 df   P(> X^2)
## Likelihood Ratio 36.609  3 5.5663e-08
## Pearson          35.662  3 8.8259e-08
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.034 
## Cramer's V        : 0.034

For both variables (education and marriage) we see a statistically significant impact on the default category. However, the contingency coefficients (which tell us how strongly the features are correlated) are relatively small.